################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2024)

# Description:

## This files takes as input the estimated propensities and posteriors to
## calculate divergence of words according to equation 3.

################################################################################

## Packages

library(data.table)
library(stringr)
library(purrr)
library(dplyr)

## Directories

dir <- "../data/2_processed_data"
d.dir   <- "../data/3_model_output"

## Data

speaker_metadata <- fread(paste(dir, "speeches_session_lemma.csv", sep = "/"), encoding="UTF-8")


# I remove punctuation, which should not be included when counting words

speaker_metadata[, text := str_replace_all(text, "[[:punct:]]", "")]


## Functions


zeta_function <- function(group_indicator, Q, RHO){
  
  J <- ncol(Q) # Number of words
  
  q_real  <- Q[1:(nrow(Q)/2),] # real speakers
  q_clone <- Q[(nrow(Q)/2+1):nrow(Q),] # clone speakers
  
  rho <- RHO[1:(nrow(RHO)/2),] # rho is just duplicated
  
  ## Generating the propensities for R and D (using the same notation as in
  ## the equation in 6.1. Partisan phrases in Gentzkow et al. (2019), but R stands for in our case right-wing parties, female,
  ## old, and rural speakers)
  qR <- q_real*group_indicator + q_clone*(1-group_indicator)
  qD <- q_real*(1-group_indicator) + q_clone*group_indicator
  
  productR <- qR*rho # Propensity to hear word in columns times the posterior that the word is said by R.
  productD <- qD*rho
  rsumsR <- rowSums(productR) # This is the expected posterior for R
  rsumsD <- rowSums(productD) # This is the exptected posterior for D
  
  ## Matrices for expected posteriors
  
  mat_rsumsR <- matrix(rep(rsumsR, J), ncol = J) 
  mat_rsumsD <- matrix(rep(rsumsD, J), ncol = J)
  
  ## Remove the probability mass provided by word j from the total
  
  leaveoutR <- mat_rsumsR-productR
  leaveoutD <- mat_rsumsD-productD
  
  ## Scale it such that the probabilities sum to one after removing a word.
  
  leaveoutR_scaled <- leaveoutR/(1-qR)
  leaveoutD_scaled <- leaveoutD/(1-qD)
  
  ## Calculated the partisan phrases for each speaker
  
  change_in_expected_posterior_of_R <- 0.5 - 0.5*(leaveoutR_scaled + leaveoutD_scaled)
  
  zeta <- colMeans(change_in_expected_posterior_of_R)
  
  return(zeta)
  
}

divergent.50.word.table.function <- function(data, partisan_word_vec, indicator = "H"){
  
  # Finding the most divergence words
  
  partisan_word_vec_sorted <- sort(partisan_word_vec) # from negative to positive
  
  ## Words for indicator = 1 (the more positive the more divergence)
  
  top_word_1    <- partisan_word_vec_sorted[(length(partisan_word_vec_sorted)-49):length(partisan_word_vec_sorted)]
  
  top_word_1    <- sort(top_word_1, decreasing = T)
  names_top_word_1 <- names(top_word_1)
  
  ## Words for indicator = 0 (the more negative the more divergence)
  
  top_word_0 <- partisan_word_vec_sorted[1:50]
  names_top_word_0 <- names(top_word_0)
  
  # Counting the number of times the divergent words are used for indicator = 1
  # and indicator = 0
  setDT(data)
  dt <- data[, .(speech = paste(text, collapse = " ")), by = indicator]
  
  # Split text by words
  dt[, speech_vec := map(speech, \(x) str_split_1(x, pattern = " "))]
  
  # Table speeches to count number of times each word is used.
  dt[, speech_vec_tbl := map(speech_vec, \(x) table(x))]
  
  # total number words is the sum of the individual words
  dt[, count      := map_dbl(speech_vec_tbl, sum)]
  
  # Count number of times speakser say the divergence words for both groups
  
  list.top.1 <- list(names_top_word_1, names_top_word_1)
  list.top.0 <- list(names_top_word_0, names_top_word_0)
  
  dt[, `:=` (top_word_1 = list.top.1,
             top_word_0 = list.top.0)]
  
  # Do the calculations
  ## Indicator == 1
  dt[, top_word_count_1 :=  map2(speech_vec_tbl, top_word_1, \(x, y) x[which(names(x) %in% y)])]
  dt[, top_word_share_1 :=  map2(top_word_count_1, count, \(x,y) x/y*100000)]
  ## Indicator == 0
  dt[, top_word_count_0 :=  map2(speech_vec_tbl, top_word_0, \(x, y) x[which(names(x) %in% y)])]
  dt[, top_word_share_0 :=  map2(top_word_count_0, count, \(x,y) x/y*100000)]
  
  ## Sort according to degree of partisanship
  
  dt[, top_word_share_0 := map2(top_word_share_0, top_word_0, \(x,y) x[match(y, names(x))])]
  dt[, top_word_share_1 := map2(top_word_share_1, top_word_1, \(x,y) x[match(y, names(x))])]
  
  
  
  tbl <- data.frame(rank           = 1:50,
                    words_1        = names(dt[get(indicator) == 1,]$top_word_share_1[[1]]),
                    n_words_1_by_0 = ceiling(as.numeric((dt[get(indicator) == 0,]$top_word_share_1[[1]]))),
                    n_words_1_by_1 = ceiling(as.numeric((dt[get(indicator) == 1,]$top_word_share_1[[1]]))),
                    words_0        = names(dt[get(indicator) == 0,]$top_word_share_0[[1]]),
                    n_words_0_by_0 = ceiling(as.numeric((dt[get(indicator) == 0,]$top_word_share_0[[1]]))),
                    n_words_0_by_1 = ceiling(as.numeric((dt[get(indicator) == 1,]$top_word_share_0[[1]])))
  )
  
  names(tbl) <- str_replace_all(names(tbl), "1", indicator)
  
  
  return(tbl)
  
}

################################################################################

#########################Calculating Zeta, and keeping top 50 zeta words #######

################################################################################



######### BLOC #################################################


q   <- readRDS(paste(d.dir, "q_bloc_80_2021.rds", sep = "/"))

rho <- readRDS(paste(d.dir, "rho_bloc_80_2021.rds", sep = "/"))

speaker_metadata[, year := substr(session, 1, 4)]

speaker_metadata[, H := 1*(bloc == "H")]

## Make sure the data is organized according to rownames of q

bloc_indicator <- speaker_metadata[match(rownames(q[1:(nrow(q)/2),]), pid_session), .(pid_session, H)]

bloc_indicator <- speaker_metadata$H

## Calculate divergence

zeta_bloc <- zeta_function(group_indicator = bloc_indicator, Q = q, RHO = rho)

### Making table of divergence and word frequency by indicator

t_bloc <- divergent.50.word.table.function(data = speaker_metadata, 
                                           partisan_word_vec = zeta_bloc, 
                                           indicator = "H")

fwrite(t_bloc, file = paste(d.dir, "di_words_top50_bloc.csv", sep = "/"))

######### GENDER #################################################

## Data

q   <- readRDS(paste(d.dir, "q_gender_80_2021.rds", sep = "/"))

rho <- readRDS(paste(d.dir, "rho_gender_80_2021.rds", sep = "/"))

gender_indicator <- speaker_metadata$female

## Calculating divergence

zeta_gender <- zeta_function(group_indicator = gender_indicator, Q = q, RHO = rho)

## Making table of divergence and word frequency by indicator

t_gender <- divergent.50.word.table.function(data = speaker_metadata, 
                                             partisan_word_vec = zeta_gender, 
                                             indicator = "female")

fwrite(t_gender, file = paste(d.dir, "di_words_top50_gender.csv", sep = "/"))


######### AGE #################################################

## Data


q   <- readRDS(paste(d.dir, "q_age_80_2021.rds", sep = "/"))

rho <- readRDS(paste(d.dir, "rho_age_80_2021.rds", sep = "/"))

speaker_metadata[, old := case_when(age_cat == "old" ~ 1, T ~ 0)]

age_indicator <- speaker_metadata$old


## Calculating divergence

zeta_age <- zeta_function(group_indicator = age_indicator, Q = q, RHO = rho)



## Making table of divergence and word frequency by indicator


t_age <- divergent.50.word.table.function(data = speaker_metadata,
    partisan_word_vec = zeta_age, indicator = "old")

fwrite(t_age, file = paste(d.dir, "di_words_top50_age.csv", sep = "/"))


######### URBAN #################################################

## Data


q   <- readRDS(paste(d.dir, "q_town_80_2021.rds", sep = "/"))

rho <- readRDS(paste(d.dir, "rho_town_80_2021.rds", sep = "/"))

speaker_metadata[, urban := dplyr::if_else(town == "urban", 1, 0)] 


urban_indicator <- speaker_metadata$urban


## Calculating divergence

zeta_urban <- zeta_function(group_indicator = urban_indicator, Q = q, RHO = rho)

## Making table of divergence and word frequency by indicator

t_urban <- divergent.50.word.table.function(data = speaker_metadata,
                                            partisan_word_vec = zeta_urban, 
                                            indicator = "urban")

fwrite(t_urban, file = paste(d.dir, "di_words_top50_urban.csv", sep = "/"))


######### SOCIAL BACKGROUND #################################################

## Data

q   <- readRDS(paste(d.dir, "q_occ_80_2021.rds", sep = "/"))

rho <- readRDS(paste(d.dir, "rho_occ_80_2021.rds", sep = "/"))

speaker_metadata[, white := case_when(occupation == "white" ~ 1,
                                      is.na(occupation) ~ NA, T ~ 0)] 


white_indicator <- speaker_metadata[pid_session %in% rownames(q)]
white_indicator <- white_indicator[match(rownames(q[1:(nrow(q)/2),]), pid_session)]

white_indicator <- white_indicator$white


## Calculating divergence

zeta_background <- zeta_function(group_indicator = white_indicator, Q = q, RHO = rho)


## Making table of divergence and word frequency by indicator

t_background <- divergent.50.word.table.function(data = speaker_metadata[!is.na(white)], 
                                                 partisan_word_vec = zeta_background, 
                                                 indicator = "white")

fwrite(t_background, file = paste(d.dir, "di_words_top50_background.csv", sep = "/"))














